library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(ggpubr)
library(ggrepel)

if (!file.exists("output")) {
  dir.create("output")
}

# load data
source("HNC_metadata_tidy.R")

countries <- c("Argentina", "Brazil", "Colombia", "Czechia", "Greece", "Italy", "Romania", "Slovakia")

# ASR incidence data from GLOBOCAN 2022 https://gco.iarc.fr/
# ASR incidence in males and females from oral cavity, oropharynx, hypopharynx, and larynx cases
files <- list.files("GLOBOCAN_2022/", pattern = "ASR")

# PANEL A - Estimate of tobacco smoking prevalence (ASR)

# Estimate of current tobacco smoking prevalence (%) (age-standardized rate)
# Data obtained from WHO Global Health Observatory (2019) https://apps.who.int/gho/data/view.main.TOBAGESTDCURRv
GHO_smoking <- read.csv("Smoking_per_country_WHO_GHO.csv") %>%
  filter(Location %in% countries, Period == "2019", grepl("tobacco smoking", Indicator)) %>%
  select(ParentLocation, Location, Period, Dim1, Value) %>%
  mutate(Location = ifelse(Location == "Czechia", "Czech Republic", Location)) %>%
  filter(Dim1 != "Both sexes")

asr_loc <- NULL
for (f in files) {
  asr_f <- read.csv(paste0("GLOBOCAN_2022/", f), row.names = NULL) %>%
    select("Label", "ASR..World.") %>%
    rename(Population = `Label`, ASR = `ASR..World.`) %>%
    filter(Population %in% countries) %>%
    mutate(
      Population = ifelse(Population == "Czechia", "Czech Republic", Population),
      subsite = str_split_1(f, "[_.]")[2],
      sex = str_split_1(f, "[_.]")[3]
    )
  asr_loc <- rbind(asr_loc, asr_f)
}

asr_loc <- asr_loc %>%
  spread(subsite, ASR) %>%
  mutate(
    ASR = Hypopharynx + Larynx + OC + OPC,
    code = paste(Population, paste0(sex, "s")),
    .keep = "unused"
  )

GHO_smoking <- GHO_smoking %>%
  mutate(code = paste(Location, paste0(Dim1, "s"))) %>%
  left_join(asr_loc, by = "code")

# Run model
model <- lm("ASR ~ Value", data = GHO_smoking)
result <- summary(model)$coefficients

(panel_a <- ggplot(GHO_smoking, aes(x = ASR, y = Value)) +
  geom_smooth(method = "lm", colour = "darkblue", fill = "lightblue", size = 0.5) +
  geom_point(alpha = 0.5, stroke = 0, size = 2) +
  geom_text_repel(label = GHO_smoking$code, size = 3.5, max.overlaps = 3) +
  labs(
    title = "Estimate of tobacco smoking prevalence (ASR)",
    subtitle = paste("p =", round(result[2, 4], 4)),
    x = "Incidence (ASR)", y = "Tobacco smoking (ASR)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    plot.subtitle = element_text(size = 10),
    axis.text = element_text(size = 12),
    axis.line = element_line(size = .3,color = "#404040"),
    axis.ticks = element_line(size = .3,color = "#404040"),
    panel.grid = element_blank()
  )
)

# ggsave(plot = panel_a,
#        filename = paste0("output/Figure_4a_",Sys.Date(),".pdf"),
#        device = "pdf" , width = 6, height = 4.5, units = "in")

# PANEL B - Cigarette quantity associations across countries in the MUTOGRAPHS-HNC dataset

asr_sub <- NULL
for (f in files) {
  asr_f <- read.csv(paste0("GLOBOCAN_2022/", f), row.names = NULL) %>%
    select("Label", "ASR..World.") %>%
    rename(Population = `Label`, ASR = `ASR..World.`) %>%
    filter(Population %in% countries) %>%
    mutate(
      Population = ifelse(Population == "Czechia", "Czech Republic", Population),
      subsite = str_split_1(f, "[_.]")[2],
      sex = str_split_1(f, "[_.]")[3],
      code = paste(Population, subsite, paste0(sex, "s"))
    )
  asr_sub <- rbind(asr_sub, asr_f)
}

data <- data %>%
  mutate(
    cigqty = ifelse(tobacco_ever == "No", 0, cigqty),
    code = paste(country, subsite, paste0(sex, "s"))
  ) %>%
  left_join(asr_sub[c("code", "ASR")], by = "code")

confounders <- "age_group"
cig_var <- "cigqty"

myformula <- paste0(cig_var, " ~ ", paste(c("ASR", confounders), collapse = " + "))
print(myformula)
model <- lm(myformula, data = data)
result <- summary(model)$coefficients
pval <- result[rownames(result) == "ASR"][4]
country_avg <- data %>%
  group_by(code) %>%
  summarize(n = n(), cig_avg = mean(get(cig_var), na.rm = T), ASR = median(ASR))

(panel_b <- ggplot(data, aes(x = ASR, y = get(cig_var))) +
  geom_smooth(method = lm, colour = "darkblue", fill = "lightblue", size = 0.5) +
  geom_point(data = country_avg, aes(x = ASR, y = cig_avg, size = n), alpha = 0.5, stroke = 0) +
  geom_text_repel(data = country_avg, aes(x = ASR, y = cig_avg), label = country_avg$code, size = 3.5, max.overlaps = 5) +
  labs(
    title = paste0("ASR/cigarette quantity associations across countries"),
    subtitle = paste("p =", formatC(pval, 4, format = "e", digits = 2)),
    x = "Incidence (ASR)", y = "Average cigarettes per day", size = "Number \n of samples"
  ) +
  theme_minimal() +
  theme(
    theme(legend.position = "right"),
    plot.title = element_text(face = "bold", size = 12),
    plot.subtitle = element_text(size = 10),
    axis.text = element_text(size = 12),
    axis.line = element_line(size = .3,color = "#404040"),
    axis.ticks = element_line(size = .3,color = "#404040"),
    panel.grid = element_blank()
  )
)

# ggsave(plot = panel_b,
#       filename = paste0("output/Figure_4b_",cig_var,"_",Sys.Date(),".pdf"),
#       device = "pdf" , width = 6, height = 4.5, units = "in")

plots <- ggarrange(panel_a, panel_b, nrow = 1, align = "h", labels = c("a", "b"), widths = c(1, 1.13))

ggsave(
  plot = plots,
  filename = paste0("output/Figure_4a-b_", cig_var, "_", Sys.Date(), ".pdf"),
  device = "pdf", width = 12.5, height = 4.5, units = "in"
)
